library(corrplot)
## corrplot 0.84 loaded
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Jordan
# save the data separately
jh_data <- read.csv(file = 'NBA_salary_stats.csv', row.names = 1, header = TRUE)
# Correlation plot (Khoi's code)
corrplot(cor(jh_data[, -c(1, 3, 4, 30, 38, 45)],
# corrplot(cor(jh_data[c(2,10)],
use = 'complete.obs'),
method = 'circle',
type = 'upper')
# Just correlation of all vars with y
correlations<-cor(jh_data[, -c(1, 3, 4, 30, 38, 45)])[ ,1]
# save the highly correlated variables (above .5 is my threshold)
high_cor <- Filter(function(x) x>.5,correlations)
# remove the y variable
high_cor<-high_cor[2:length(high_cor)]
# Loop through all highly correlated vars to get regressions and plots
for (c in names(high_cor)) {
# get the model for each highly correlated predictor
l<-lm(jh_data$Actual~jh_data[[c]])
# plot the model with the regression line
plot(jh_data$Actual~jh_data[[c]], main=c)
abline(l,col='red')
# residual plots
# these residual plots look really good. The only note is that the higher end errors are a little bigger making the center above the clear mass of residuals in most plots.
plot(residuals(l), main = c)
abline(h=0)
}
# see chap 8 and chap 3
for (c in 1:ncol(jh_data)) {
(is.numeric(jh_data[c]))
}
### Khoi
# exploratory data analysis
## reading in data
kt_data <- read.csv(file = 'NBA_salary_stats.csv',
row.names = 1,
header = TRUE)
# drop Guaranteed salary in exchange for using 'X2019.20' (actual pay) when predictingsalary (index = 3rd column)
### correlation plots, 2019-20 season performance stats
## correlation plots for total point scoring
corrplot(cor(subset(kt_data, select = c('Actual', 'MP', 'G', 'GS', 'X2P', 'X3P', 'FT',
'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF')),
use = 'complete.obs'),
method = 'circle',
type = 'upper')
## correlation plots with accuracy (e.g. 3-point percentage)
corrplot(cor(subset(kt_data, select = c('Actual', 'MP', 'G', 'GS.', 'X2P.', 'X3P.', 'FT.',
'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF')),
use = 'complete.obs'),
method = 'circle',
type = 'upper')
### correlation plots, overall career statistics
# e.g. mid-season trades, experience, age, etc.
corrplot(cor(subset(kt_data, select = c('Actual', 'yrs_exp', 'age_yrs', 'draft_num',
'num_accolades', 'NBA_titles', 'all_NBA', 'num_teams_1920',
'mid_season_trades', 'num_yrs_current_team',
'num_teams_played', 'injury_yrs', 'twitter_followers')),
use = 'complete.obs'),
method = 'circle',
type = 'upper')
## list variables most correlated to actual salary
cor(subset(kt_data, select = c('Actual', 'MP', 'G', 'GS', 'X2P', 'X3P', 'FT',
'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF')))[,1]
## Actual MP G GS X2P X3P FT ORB
## 1.0000000 0.4776307 0.2580372 0.5198420 0.5622481 0.3826085 0.5813015 0.3301995
## DRB AST STL BLK TOV PF
## 0.5018673 0.5243857 0.4094649 0.3252348 0.5551408 0.3668231
# GS, X2P, FT, DRB, AST, TOV are > 0.5
# MP, X3P, ORB, STL, BLK, PF > 0.3
cor(subset(kt_data, select = c('Actual', 'MP', 'G', 'GS.', 'X2P.', 'X3P.', 'FT.',
'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF')))[,1]
## Actual MP G GS. X2P. X3P. FT.
## 1.00000000 0.47763072 0.25803717 0.55611939 0.05032016 0.11237290 0.11830423
## ORB DRB AST STL BLK TOV PF
## 0.33019950 0.50186730 0.52438573 0.40946493 0.32523475 0.55514079 0.36682313
# GS., DRB, AST, TOV are > 0.5
# MP, ORB, STL, BLK, PF > 0.3
cor(subset(kt_data, select = c('Actual', 'yrs_exp', 'age_yrs', 'draft_num',
'num_accolades', 'NBA_titles', 'all_NBA', 'num_teams_1920',
'mid_season_trades', 'num_yrs_current_team',
'num_teams_played', 'injury_yrs', 'twitter_followers')))[,1]
## Actual yrs_exp age_yrs
## 1.00000000 0.50866761 0.37333296
## draft_num num_accolades NBA_titles
## -0.09161885 0.58950054 0.12658476
## all_NBA num_teams_1920 mid_season_trades
## 0.47315504 0.01393671 0.01393671
## num_yrs_current_team num_teams_played injury_yrs
## 0.34030605 0.06716042 0.16221740
## twitter_followers
## 0.30122396
# yrs_exp, num_accolades are 0.5
# age_yrs, all_NBA, num_yrs_current_team, twitter_followers > 0.3
# Jordan
# automated search procedures
# !! Do we want to split the data and do crosswise regression?
##intercept only model
regnull <- lm(jh_data$Actual~1, data=jh_data)
##model with all predictors
regfull <- lm(Actual~.-Player-Guaranteed-url, data=jh_data)
# step_reg <- step(regnull, scope=list(lower=regnull, upper=regfull), direction="both")
# model evaluation see chapters 5 and 7
library(faraway)
##
## Attaching package: 'faraway'
## The following object is masked from 'package:GGally':
##
## happy
jh_result<-lm(Actual ~ num_accolades + PTS + yrs_exp + num_teams_played +
GS. + NBA_titles + num_teams_1920 + PF + DRB + GS + eFG. +
all_NBA, data = jh_data)
summary(jh_result)
##
## Call:
## lm(formula = Actual ~ num_accolades + PTS + yrs_exp + num_teams_played +
## GS. + NBA_titles + num_teams_1920 + PF + DRB + GS + eFG. +
## all_NBA, data = jh_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24942709 -2311847 -36118 2071513 18876940
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1757364.4 713656.5 -2.462 0.014173 *
## num_accolades 704532.2 294578.3 2.392 0.017183 *
## PTS 5865.4 967.9 6.060 2.89e-09 ***
## yrs_exp 1249579.0 100419.8 12.444 < 2e-16 ***
## num_teams_played -1544257.3 201325.6 -7.670 1.07e-13 ***
## GS. 8659338.7 1696598.0 5.104 4.92e-07 ***
## NBA_titles -1719365.4 542477.4 -3.169 0.001632 **
## num_teams_1920 3667862.2 1055950.9 3.474 0.000564 ***
## PF -25757.8 7395.8 -3.483 0.000545 ***
## DRB 9673.4 3450.7 2.803 0.005277 **
## GS -74325.6 31020.7 -2.396 0.016983 *
## eFG. -3911371.9 2049989.6 -1.908 0.057029 .
## all_NBA 447025.3 288252.4 1.551 0.121651
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5124000 on 449 degrees of freedom
## Multiple R-squared: 0.6633, Adjusted R-squared: 0.6543
## F-statistic: 73.71 on 12 and 449 DF, p-value: < 2.2e-16
anova(jh_result)
## Analysis of Variance Table
##
## Response: Actual
## Df Sum Sq Mean Sq F value Pr(>F)
## num_accolades 1 1.2165e+16 1.2165e+16 463.4195 < 2.2e-16 ***
## PTS 1 4.7451e+15 4.7451e+15 180.7565 < 2.2e-16 ***
## yrs_exp 1 2.1762e+15 2.1762e+15 82.8976 < 2.2e-16 ***
## num_teams_played 1 2.0693e+15 2.0693e+15 78.8283 < 2.2e-16 ***
## GS. 1 6.1726e+14 6.1726e+14 23.5139 1.712e-06 ***
## NBA_titles 1 4.0733e+14 4.0733e+14 15.5165 9.480e-05 ***
## num_teams_1920 1 2.5231e+14 2.5231e+14 9.6115 0.0020552 **
## PF 1 3.9256e+14 3.9256e+14 14.9540 0.0001264 ***
## DRB 1 1.1701e+14 1.1701e+14 4.4573 0.0353047 *
## GS 1 1.1810e+14 1.1810e+14 4.4987 0.0344673 *
## eFG. 1 9.6619e+13 9.6619e+13 3.6806 0.0556839 .
## all_NBA 1 6.3134e+13 6.3134e+13 2.4050 0.1216513
## Residuals 449 1.1787e+16 2.6251e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vif(jh_result)
## num_accolades PTS yrs_exp num_teams_played
## 3.774549 3.650018 3.014659 2.590077
## GS. NBA_titles num_teams_1920 PF
## 8.291112 1.376388 8.998956 4.614315
## DRB GS eFG. all_NBA
## 4.463431 12.166038 9.453790 2.388246
# multicolinearity appears to be an issue with GS and potentially eFG., num_teams_1920, and GS.
#correlation between variables?
vcov(jh_result)[,c('GS','eFG.',"num_teams_1920","GS.")]
## GS eFG. num_teams_1920 GS.
## (Intercept) 9763583887 1.860193e+11 -2.540477e+11 -632128174737
## num_accolades 1608937133 1.932269e+10 -8.122633e+09 -98031547225
## PTS -7837783 -1.226421e+07 -6.260376e+05 121083911
## yrs_exp 160994678 -1.740717e+09 6.798448e+09 -16301658003
## num_teams_played -483040741 -2.024686e+10 -1.268261e+10 48870659815
## GS. -45352232117 -3.379549e+11 3.368037e+11 2878444861631
## NBA_titles -1856407142 -3.233093e+10 1.360718e+10 85831133768
## num_teams_1920 -5417073792 -1.984803e+12 1.115032e+12 336803745321
## PF -51900343 -2.427541e+09 5.363480e+08 1662852170
## DRB -24464050 -8.514083e+08 3.845955e+08 855183532
## GS 962283269 8.131469e+09 -5.417074e+09 -45352232117
## eFG. 8131469194 4.202458e+12 -1.984803e+12 -337954901794
## all_NBA -275278304 3.978419e+09 -1.311960e+09 30661898053
From the VCOV table, looking at rows specifically: GS -
# let's try dropping GS (highest VIF) and eFG. (also a major one) and see how things change
jh_result.2<-lm(Actual ~ num_accolades + PTS + yrs_exp + num_teams_played +
GS. + NBA_titles + num_teams_1920 + PF + DRB +
all_NBA, data = jh_data)
summary(jh_result.2)
##
## Call:
## lm(formula = Actual ~ num_accolades + PTS + yrs_exp + num_teams_played +
## GS. + NBA_titles + num_teams_1920 + PF + DRB + all_NBA, data = jh_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24725614 -2278176 -64771 1993189 18498666
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -921017.9 642952.5 -1.432 0.152699
## num_accolades 833353.3 291996.3 2.854 0.004516 **
## PTS 5302.9 940.5 5.638 3.04e-08 ***
## yrs_exp 1259550.9 100977.3 12.474 < 2e-16 ***
## num_teams_played -1594406.2 201971.7 -7.894 2.24e-14 ***
## GS. 5192350.9 866562.5 5.992 4.25e-09 ***
## NBA_titles -1875971.8 542885.4 -3.456 0.000601 ***
## num_teams_1920 1709343.9 421254.8 4.058 5.84e-05 ***
## PF -31346.3 7187.4 -4.361 1.60e-05 ***
## DRB 7271.8 3367.9 2.159 0.031363 *
## all_NBA 430770.6 290119.8 1.485 0.138295
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5160000 on 451 degrees of freedom
## Multiple R-squared: 0.657, Adjusted R-squared: 0.6494
## F-statistic: 86.4 on 10 and 451 DF, p-value: < 2.2e-16
# Adj R2 is about the same
anova(jh_result.2)
## Analysis of Variance Table
##
## Response: Actual
## Df Sum Sq Mean Sq F value Pr(>F)
## num_accolades 1 1.2165e+16 1.2165e+16 456.9866 < 2.2e-16 ***
## PTS 1 4.7451e+15 4.7451e+15 178.2474 < 2.2e-16 ***
## yrs_exp 1 2.1762e+15 2.1762e+15 81.7469 < 2.2e-16 ***
## num_teams_played 1 2.0693e+15 2.0693e+15 77.7341 < 2.2e-16 ***
## GS. 1 6.1726e+14 6.1726e+14 23.1874 2.009e-06 ***
## NBA_titles 1 4.0733e+14 4.0733e+14 15.3011 0.0001058 ***
## num_teams_1920 1 2.5231e+14 2.5231e+14 9.4781 0.0022062 **
## PF 1 3.9256e+14 3.9256e+14 14.7465 0.0001406 ***
## DRB 1 1.1701e+14 1.1701e+14 4.3954 0.0365922 *
## all_NBA 1 5.8689e+13 5.8689e+13 2.2046 0.1382947
## Residuals 451 1.2006e+16 2.6621e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vif(jh_result.2)
## num_accolades PTS yrs_exp num_teams_played
## 3.657189 3.398566 3.005911 2.570543
## GS. NBA_titles num_teams_1920 PF
## 2.132965 1.359324 1.412289 4.297441
## DRB all_NBA
## 4.192612 2.385706
# much better VIF
#partial f test, if this has p>0.05 we reject null and keep full model.
# remove all_NBA
reduced.1 <- lm(Actual~num_accolades + PTS + yrs_exp + num_teams_played +
GS. + NBA_titles + num_teams_1920 + PF + DRB, data = jh_data)
anova(reduced.1,jh_result.2)
## Analysis of Variance Table
##
## Model 1: Actual ~ num_accolades + PTS + yrs_exp + num_teams_played + GS. +
## NBA_titles + num_teams_1920 + PF + DRB
## Model 2: Actual ~ num_accolades + PTS + yrs_exp + num_teams_played + GS. +
## NBA_titles + num_teams_1920 + PF + DRB + all_NBA
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 452 1.2065e+16
## 2 451 1.2006e+16 1 5.8689e+13 2.2046 0.1383
summary(reduced.1)
##
## Call:
## lm(formula = Actual ~ num_accolades + PTS + yrs_exp + num_teams_played +
## GS. + NBA_titles + num_teams_1920 + PF + DRB, data = jh_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23147747 -2217941 1273 2103629 18287198
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -972505.9 642871.7 -1.513 0.131041
## num_accolades 1114547.8 222548.9 5.008 7.90e-07 ***
## PTS 5339.4 941.5 5.671 2.53e-08 ***
## yrs_exp 1266141.9 101014.0 12.534 < 2e-16 ***
## num_teams_played -1613655.0 201823.6 -7.995 1.09e-14 ***
## GS. 5100909.3 865522.4 5.893 7.40e-09 ***
## NBA_titles -1963533.7 540391.6 -3.634 0.000312 ***
## num_teams_1920 1708847.2 421815.6 4.051 6.00e-05 ***
## PF -31102.9 7195.1 -4.323 1.90e-05 ***
## DRB 7054.1 3369.1 2.094 0.036840 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5166000 on 452 degrees of freedom
## Multiple R-squared: 0.6554, Adjusted R-squared: 0.6485
## F-statistic: 95.5 on 9 and 452 DF, p-value: < 2.2e-16
# we fail to reject the null so let's go with the smaller model
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
# Do we need to transform predictor? If lambda=1 is in the confidence interval we do not need a transformation, if 0 is in CI then us ln(y), otherwise use y^lambda (see pg 17 in notes)
boxcox(jh_result)
# check distribution of residuals
plot(residuals(jh_result), main = c)
abline(h=0,col='red')
# check normal distribution
qqnorm(jh_result$residuals)
qqline(jh_result$residuals, col="red")
# autocorrelation
acf(jh_result$residuals, main="ACF of Residuals")
Questions: 1. Is it a problem if we don’t have categorical variables? 2. VIF suggests we have some multicolinearity, do we drop those with high VIF’s? 3. Do we want to error on the side of more predictors (less bias) or fewer predictors (less variance)? 4. Should we do cross validation? 5. Should we check AIC/BIC/Press statistic (43) 6. Test predictive ability? 7. Should we try to get better linear fits by dealing with outliers? 8. (partial regression plot, pg. 48, module 8)
jh_data$Pos<-factor(jh_data$Pos)
is.factor(jh_data$Pos)
## [1] TRUE
levels(jh_data$Pos)
## [1] "C" "C-PF" "PF" "PF-C" "PF-SF" "PG" "PG-SG" "SF" "SF-C"
## [10] "SF-PF" "SF-SG" "SG" "SG-PG" "SG-SF"
contrasts(jh_data$Pos)
## C-PF PF PF-C PF-SF PG PG-SG SF SF-C SF-PF SF-SG SG SG-PG SG-SF
## C 0 0 0 0 0 0 0 0 0 0 0 0 0
## C-PF 1 0 0 0 0 0 0 0 0 0 0 0 0
## PF 0 1 0 0 0 0 0 0 0 0 0 0 0
## PF-C 0 0 1 0 0 0 0 0 0 0 0 0 0
## PF-SF 0 0 0 1 0 0 0 0 0 0 0 0 0
## PG 0 0 0 0 1 0 0 0 0 0 0 0 0
## PG-SG 0 0 0 0 0 1 0 0 0 0 0 0 0
## SF 0 0 0 0 0 0 1 0 0 0 0 0 0
## SF-C 0 0 0 0 0 0 0 1 0 0 0 0 0
## SF-PF 0 0 0 0 0 0 0 0 1 0 0 0 0
## SF-SG 0 0 0 0 0 0 0 0 0 1 0 0 0
## SG 0 0 0 0 0 0 0 0 0 0 1 0 0
## SG-PG 0 0 0 0 0 0 0 0 0 0 0 1 0
## SG-SF 0 0 0 0 0 0 0 0 0 0 0 0 1
a1<-subset(jh_data,Pos=="C")
a2<-subset(jh_data,Pos=="C-PF")
a3<-subset(jh_data,Pos=="PF")
a4<-subset(jh_data,Pos=="PF-C")
a5<-subset(jh_data,Pos=="PF-SF")
a6<-subset(jh_data,Pos=="PG")
a7<-subset(jh_data,Pos=="PG-SG")
a8<-subset(jh_data,Pos=="SF")
a9<-subset(jh_data,Pos=="SF-C")
a10<-subset(jh_data,Pos=="SF-PF")
a11<-subset(jh_data,Pos=="SF-SG")
a12<-subset(jh_data,Pos=="SG")
a13<-subset(jh_data,Pos=="SG-PG")
a14<-subset(jh_data,Pos=="SG-SF")
reg1<-lm(Guaranteed~PTS,data=a1)
reg2<-lm(Guaranteed~PTS,data=a2)
reg3<-lm(Guaranteed~PTS,data=a3)
reg4<-lm(Guaranteed~PTS,data=a4)
reg5<-lm(Guaranteed~PTS,data=a5)
reg6<-lm(Guaranteed~PTS,data=a6)
reg7<-lm(Guaranteed~PTS,data=a7)
reg8<-lm(Guaranteed~PTS,data=a8)
reg9<-lm(Guaranteed~PTS,data=a9)
reg10<-lm(Guaranteed~PTS,data=a10)
reg11<-lm(Guaranteed~PTS,data=a11)
reg12<-lm(Guaranteed~PTS,data=a12)
reg13<-lm(Guaranteed~PTS,data=a13)
reg14<-lm(Guaranteed~PTS,data=a14)
plot(jh_data$PTS,jh_data$Guaranteed, main="Guaranteed against PTS, by Pos")
points(a2$PTS,a2$Guaranteed, pch=2, col="red")
points(a3$PTS,a3$Guaranteed, pch=3, col="blue")
points(a4$PTS,a4$Guaranteed, pch=4, col="green")
points(a5$PTS,a5$Guaranteed, pch=5, col="orange")
points(a6$PTS,a6$Guaranteed, pch=6, col="purple")
points(a7$PTS,a7$Guaranteed, pch=7, col="pink")
points(a8$PTS,a8$Guaranteed, pch=8, col="seagreen2")
points(a9$PTS,a9$Guaranteed, pch=9, col="salmon3")
points(a10$PTS,a10$Guaranteed, pch=10, col="royalblue4")
points(a11$PTS,a11$Guaranteed, pch=11, col="rosybrown4")
points(a12$PTS,a12$Guaranteed, pch=12, col="skyblue2")
points(a13$PTS,a13$Guaranteed, pch=13, col="slateblue2")
points(a14$PTS,a14$Guaranteed, pch=14, col="slategray2")
abline(reg1,lty=1)
# abline(reg2,lty=2, col="red")
abline(reg3,lty=3, col="blue")
abline(reg4,lty=4, col="green")
abline(reg5,lty=5, col="orange")
abline(reg6,lty=6, col="purple")
# abline(reg7,lty=7, col="pink")
abline(reg8,lty=8, col="seagreen2")
# abline(reg9,lty=9, col="salmon3")
abline(reg10,lty=10, col="royalblue4")
# abline(reg11,lty=11, col="rosybrown4")
abline(reg12,lty=12, col="skyblue2")
abline(reg13,lty=13, col="slateblue2")
# abline(reg14,lty=14, col="slategray2")